Image Captioning Project Posted on 2019-07-11 | In Artificial Intelligence , Deep Learning | | In this project you will define and train an image-to-caption model that can produce descriptions for real-world images! Image Captioning Final Project In this final project you will define and train an image-to-caption model that can produce descriptions for real-world images! Model architecture: CNN encoder and RNN decoder. () Import stuff In [2]: import sys sys.path.append("..") import grading import download_utils In [3]: download_utils.link_all_keras_resources() In [4]: import tensorflow as tf from tensorflow.contrib import keras import numpy as np %matplotlib inline import matplotlib.pyplot as plt L = keras.layers K = keras.backend import utils import time import zipfile import json from collections import defaultdict import re import random from random import choice import grading_utils import os from keras_utils import reset_tf_session import tqdm_utils Using TensorFlow backend. Prepare the storage for model checkpoints In [5]: # Leave USE_GOOGLE_DRIVE = False if you're running locally! # We recommend setting USE_GOOGLE_DRIVE = True in Google Colab! # If set to True, we will mount Google Drive, so that you can restore your checkpoint # and continue training even if your previous Colab session dies. # If set to True, follow on-screen instructions to access Google Drive (you must have a Google account). 
USE_GOOGLE_DRIVE = False


def mount_google_drive():
    """Mount Google Drive (Colab only) and return the checkpoint root inside it.

    The root is "<mount dir>/<first non-hidden entry of the mount dir>/colab".
    """
    from google.colab import drive
    mount_directory = "/content/gdrive"
    drive.mount(mount_directory)
    # skip hidden entries (names starting with a dot)
    visible_entries = [name for name in os.listdir(mount_directory) if name[0] != '.']
    drive_root = mount_directory + "/" + visible_entries[0] + "/colab"
    return drive_root


CHECKPOINT_ROOT = ""
if USE_GOOGLE_DRIVE:
    CHECKPOINT_ROOT = mount_google_drive() + "/"


def get_checkpoint_path(epoch=None):
    """Return the absolute checkpoint path.

    With `epoch=None` the path ends in "weights" (the final model),
    otherwise in "weights_<epoch>".
    """
    if epoch is None:
        return os.path.abspath(CHECKPOINT_ROOT + "weights")
    return os.path.abspath(CHECKPOINT_ROOT + "weights_{}".format(epoch))


# example of checkpoint dir
print(get_checkpoint_path(10))
# Output: /root/intro-to-dl/week6/weights_10

# Fill in your Coursera token and email
# To successfully submit your answers to our grader, please fill in your
# Coursera submission token and email.

# In [6]:
grader = grading.Grader(assignment_key="NEDBg6CgEee8nQ6uE8a7OA",
                        all_parts=["19Wpv", "uJh73", "yiJkt", "rbpnH", "E2OIL", "YJR7z"])

# In [222]:
# token expires every 30 min
COURSERA_TOKEN = ""
COURSERA_EMAIL = ""

# Download data
# Takes 10 hours and 20 GB. We've downloaded necessary files for you.
# Relevant links (just in case): train images, validation images,
# captions for both train and validation.

# In [7]:
# we downloaded them for you, just link them here
download_utils.link_week_6_resources()

# Extract image features
# We will use pre-trained InceptionV3 model for CNN encoder () and extract
# its last hidden layer as an embedding:

# In [8]:
IMG_SIZE = 299


# In [9]:
# we take the last hidden layer of InceptionV3 as an image embedding
def get_cnn_encoder():
    """Build the image encoder.

    Returns a tuple `(model, preprocess_for_model)`: InceptionV3 without its
    top, followed by global average pooling, and the matching
    `preprocess_input` function.
    """
    K.set_learning_phase(False)
    model = keras.applications.InceptionV3(include_top=False)
    preprocess_for_model = keras.applications.inception_v3.preprocess_input
    model = keras.models.Model(model.inputs, keras.layers.GlobalAveragePooling2D()(model.output))
    return model, preprocess_for_model


# Features extraction takes too much time on CPU:
# Takes 16 minutes on GPU.
# 25x slower (InceptionV3) on CPU and takes 7 hours.
# 10x slower (MobileNet) on CPU and takes 3 hours.
# So we've done it for you with the following code (reference listing only;
# it is not a notebook cell, the cells below load the pickles it produced):
#
#     # load pre-trained model
#     reset_tf_session()
#     encoder, preprocess_for_model = get_cnn_encoder()
#
#     # extract train features
#     train_img_embeds, train_img_fns = utils.apply_model(
#         "train2014.zip", encoder, preprocess_for_model, input_shape=(IMG_SIZE, IMG_SIZE))
#     utils.save_pickle(train_img_embeds, "train_img_embeds.pickle")
#     utils.save_pickle(train_img_fns, "train_img_fns.pickle")
#
#     # extract validation features
#     val_img_embeds, val_img_fns = utils.apply_model(
#         "val2014.zip", encoder, preprocess_for_model, input_shape=(IMG_SIZE, IMG_SIZE))
#     utils.save_pickle(val_img_embeds, "val_img_embeds.pickle")
#     utils.save_pickle(val_img_fns, "val_img_fns.pickle")
#
#     # sample images for learners
#     def sample_zip(fn_in, fn_out, rate=0.01, seed=42):
#         np.random.seed(seed)
#         with zipfile.ZipFile(fn_in) as fin, zipfile.ZipFile(fn_out, "w") as fout:
#             sampled = filter(lambda _: np.random.rand() < rate, fin.filelist)
#             for zInfo in sampled:
#                 fout.writestr(zInfo, fin.read(zInfo))
#
#     sample_zip("train2014.zip", "train2014_sample.zip")
#     sample_zip("val2014.zip", "val2014_sample.zip")

# In [10]:
# load prepared embeddings
train_img_embeds = utils.read_pickle("train_img_embeds.pickle")
train_img_fns = utils.read_pickle("train_img_fns.pickle")
val_img_embeds = utils.read_pickle("val_img_embeds.pickle")
val_img_fns = utils.read_pickle("val_img_fns.pickle")
# check shapes
print(train_img_embeds.shape, len(train_img_fns))
print(val_img_embeds.shape, len(val_img_fns))
# Output:
# (82783, 2048) 82783
# (40504, 2048) 40504

# In [11]:
# check prepared samples of images
list(filter(lambda x: x.endswith("_sample.zip"), os.listdir(".")))
# Out[11]: ['val2014_sample.zip', 'train2014_sample.zip']

# Extract captions for images


# In [12]:
# extract captions from zip
def get_captions_for_fns(fns, zip_fn, zip_json_path):
    """Return the list of captions for every file name in `fns`, in order.

    `zip_json_path` is the annotation JSON inside the archive `zip_fn`; its
    "images" entries map image id -> file name and its "annotations" entries
    carry one caption per (image_id, caption) pair.

    Raises KeyError for a file name that has no annotation.
    """
    # the archive is only needed while reading the JSON, so close it right away
    with zipfile.ZipFile(zip_fn) as zf:
        j = json.loads(zf.read(zip_json_path).decode("utf8"))
    id_to_fn = {img["id"]: img["file_name"] for img in j["images"]}
    fn_to_caps = defaultdict(list)
    for cap in j['annotations']:
        fn_to_caps[id_to_fn[cap['image_id']]].append(cap['caption'])
    # plain dict: unknown file names must fail loudly instead of yielding []
    fn_to_caps = dict(fn_to_caps)
    return [fn_to_caps[fn] for fn in fns]


train_captions = get_captions_for_fns(train_img_fns, "captions_train-val2014.zip",
                                      "annotations/captions_train2014.json")
val_captions = get_captions_for_fns(val_img_fns, "captions_train-val2014.zip",
                                    "annotations/captions_val2014.json")
# check shape
print(len(train_img_fns), len(train_captions))
print(len(val_img_fns), len(val_captions))
# Output:
# 82783 82783
# 40504 40504


# In [13]:
# look at training example (each has 5 captions)
def show_trainig_example(train_img_fns, train_captions, example_idx=0):
    """
    You can change example_idx and see different images
    """
    captions_by_file = dict(zip(train_img_fns, train_captions))
    all_files = set(train_img_fns)
    with zipfile.ZipFile("train2014_sample.zip") as zf:
        # keep only archive members whose base name has captions
        found_files = [info for info in zf.filelist
                       if info.filename.rsplit("/")[-1] in all_files]
        example = found_files[example_idx]
        img = utils.decode_image_from_buf(zf.read(example))
    plt.imshow(utils.image_center_crop(img))
    plt.title("\n".join(captions_by_file[example.filename.rsplit("/")[-1]]))
    plt.show()


show_trainig_example(train_img_fns, train_captions, example_idx=142)

# Prepare captions for training

# In [14]:
# preview captions data
train_captions[:2]
# Out[14]: [['A long dirt road going through a forest.', 'A SCENE OF WATER AND A PATH WAY', 'A sandy path surrounded by trees leads to a beach.', 'Ocean view through a dirt road surrounded by a forested area. 
# (Out[14], continued) ', 'dirt path leading beneath barren trees to open plains'], ['A group of zebra standing next to each other.', 'This is an image of of zebras drinking', 'ZEBRAS AND BIRDS SHARING THE SAME WATERING HOLE', 'Zebras that are bent over and drinking water together.', 'a number of zebras drinking water near one another']]

# In [119]:
from functools import reduce
from collections import Counter

# In [128]:
# special tokens
PAD = "#PAD#"
UNK = "#UNK#"
START = "#START#"
END = "#END#"


# split sentence into tokens (split into lowercased words)
def split_sentence(sentence):
    """Lowercase `sentence` and split it on runs of non-word characters.

    Empty fragments (from leading/trailing punctuation) are dropped.
    """
    # raw string: '\W' in a plain literal is an invalid escape sequence
    return [token for token in re.split(r'\W+', sentence.lower()) if len(token) > 0]


def generate_vocabulary(train_captions):
    """
    Return {token: index} for all train tokens (words) that occur 5 times or more,
        `index` should be from 0 to N, where N is a number of unique tokens in the resulting dictionary.
    Use `split_sentence` function to split sentence into tokens.
    Also, add PAD (for batch padding), UNK (unknown, out of vocabulary),
        START (start of sentence) and END (end of sentence) tokens into the vocabulary.
    """
    counter = Counter()
    for captions in train_captions:
        for sentence in captions:
            counter.update(split_sentence(sentence))
    vocab = {item for item, count in counter.items() if count >= 5}
    vocab = vocab.union({PAD, UNK, START, END})
    # sorting makes the token -> index assignment deterministic
    return {token: index for index, token in enumerate(sorted(vocab))}


def caption_tokens_to_indices(captions, vocab):
    """
    `captions` argument is an array of arrays:
    [
        [
            "image1 caption1",
            "image1 caption2",
            ...
        ],
        [
            "image2 caption1",
            "image2 caption2",
            ...
        ],
        ...
    ]
    Use `split_sentence` function to split sentence into tokens.
    Replace all tokens with vocabulary indices, use UNK for unknown words (out of vocabulary).
    Add START and END tokens to start and end of each sentence respectively.
    For the example above you should produce the following:
    [
        [
            [vocab[START], vocab["image1"], vocab["caption1"], vocab[END]],
            [vocab[START], vocab["image1"], vocab["caption2"], vocab[END]],
            ...
        ],
        ...
    ]
    """
    res = []
    for img_captions in captions:
        img_indx_captions = []
        for sentence in img_captions:
            indx_sentence = ([vocab[START]]
                             + [vocab.get(token, vocab[UNK]) for token in split_sentence(sentence)]
                             + [vocab[END]])
            img_indx_captions.append(indx_sentence)
        res.append(img_indx_captions)
    return res


# In [129]:
# prepare vocabulary
vocab = generate_vocabulary(train_captions)
vocab_inverse = {idx: w for w, idx in vocab.items()}
print(len(vocab))
# Output: 8769

# In [130]:
# replace tokens with indices
train_captions_indexed = caption_tokens_to_indices(train_captions, vocab)
val_captions_indexed = caption_tokens_to_indices(val_captions, vocab)

# Captions have different length, but we need to batch them, that's why we
# will add PAD tokens so that all sentences have an equal length.
# We will crunch LSTM through all the tokens, but we will ignore padding
# tokens during loss calculation.


# In [132]:
# we will use this during training
def batch_captions_to_matrix(batch_captions, pad_idx, max_len=None):
    """
    `batch_captions` is an array of arrays:
    [
        [vocab[START], ..., vocab[END]],
        [vocab[START], ..., vocab[END]],
        ...
    ]
    Put vocabulary indexed captions into np.array of shape (len(batch_captions), columns),
        where "columns" is max(map(len, batch_captions)) when max_len is None
        and "columns" = min(max_len, max(map(len, batch_captions))) otherwise.
    Add padding with pad_idx where necessary.
    Input example: [[1, 2, 3], [4, 5]]
    Output example: np.array([[1, 2, 3], [4, 5, pad_idx]]) if max_len=None
    Output example: np.array([[1, 2], [4, 5]]) if max_len=2
    Output example: np.array([[1, 2, 3], [4, 5, pad_idx]]) if max_len=100
    Try to use numpy, we need this function to be fast!
    """
    # default=0 keeps an empty batch from raising inside max()
    longest = max(map(len, batch_captions), default=0)
    # `is None` (not truthiness) so that max_len=0 is honoured as a real limit
    columns = longest if max_len is None else min(max_len, longest)
    # dtype stays float64, which is what the previous np.empty(...) produced,
    # so the graded output below does not change
    matrix = np.full((len(batch_captions), columns), pad_idx, dtype=np.float64)
    for row, caption in enumerate(batch_captions):
        trimmed = caption[:columns]
        # cells past len(trimmed) already hold pad_idx
        matrix[row, :len(trimmed)] = trimmed
    return matrix


# In [133]:
## GRADED PART, DO NOT CHANGE!
# Vocabulary creation
grader.set_answer("19Wpv", grading_utils.test_vocab(vocab, PAD, UNK, START, END))
# Captions indexing
grader.set_answer("uJh73", grading_utils.test_captions_indexing(train_captions_indexed, vocab, UNK))
# Captions batching
grader.set_answer("yiJkt", grading_utils.test_captions_batching(batch_captions_to_matrix))

# In [134]:
# you can make submission with answers so far to check yourself at this stage
grader.submit(COURSERA_EMAIL, COURSERA_TOKEN)
# Output: Submitted to Coursera platform. See results on assignment page!

# In [99]:
# make sure you use correct argument in caption_tokens_to_indices
assert len(caption_tokens_to_indices(train_captions[:10], vocab)) == 10
assert len(caption_tokens_to_indices(train_captions[:5], vocab)) == 5

# Training
# Define architecture
# Since our problem is to generate image captions, RNN text generator should
# be conditioned on image. The idea is to use image features as an initial
# state for RNN instead of zeros.
# Remember that you should transform image feature vector to RNN hidden state
# size by fully-connected layer and then pass it to RNN.
# During training we will feed ground truth tokens into the lstm to get
# predictions of next tokens.
# Notice that we don't need to feed last token (END) as input ():

# In [135]:
IMG_EMBED_SIZE = train_img_embeds.shape[1]
IMG_EMBED_BOTTLENECK = 120
WORD_EMBED_SIZE = 100
LSTM_UNITS = 300
LOGIT_BOTTLENECK = 120
pad_idx = vocab[PAD]

# In [139]:
IMG_EMBED_SIZE, pad_idx, LOGIT_BOTTLENECK
# Out[139]: (2048, 1, 120)

# In [148]:
# remember to reset your graph if you want to start building it from scratch!
s = reset_tf_session()
tf.set_random_seed(42)

# Here we define decoder graph.
# We use Keras layers where possible because we can use them in functional
# style with weights reuse like this:
#
#     dense_layer = L.Dense(42, input_shape=(None, 100), activation='relu')
#     a = tf.placeholder('float32', [None, 100])
#     b = tf.placeholder('float32', [None, 100])
#     dense_layer(a)  # that's how we applied dense layer!
#     dense_layer(b)  # and again
#
# Here's a figure to help you with flattening in decoder:


# In [149]:
class decoder:
    # NOTE: the class is used as a namespace for graph nodes; statements run
    # once at class-definition time, in this order.

    # [batch_size, IMG_EMBED_SIZE] of CNN image features
    img_embeds = tf.placeholder('float32', [None, IMG_EMBED_SIZE])
    # [batch_size, time steps] of word ids
    sentences = tf.placeholder('int32', [None, None])

    # we use bottleneck here to reduce the number of parameters
    # image embedding -> bottleneck
    img_embed_to_bottleneck = L.Dense(IMG_EMBED_BOTTLENECK,
                                      input_shape=(None, IMG_EMBED_SIZE),
                                      activation='elu')
    # image embedding bottleneck -> lstm initial state
    img_embed_bottleneck_to_h0 = L.Dense(LSTM_UNITS,
                                         input_shape=(None, IMG_EMBED_BOTTLENECK),
                                         activation='elu')
    # word -> embedding
    word_embed = L.Embedding(len(vocab), WORD_EMBED_SIZE)
    # lstm cell (from tensorflow)
    lstm = tf.nn.rnn_cell.LSTMCell(LSTM_UNITS)

    # we use bottleneck here to reduce model complexity
    # lstm output -> logits bottleneck
    token_logits_bottleneck = L.Dense(LOGIT_BOTTLENECK,
                                      input_shape=(None, LSTM_UNITS),
                                      activation="elu")
    # logits bottleneck -> logits for next token prediction
    token_logits = L.Dense(len(vocab),
                           input_shape=(None, LOGIT_BOTTLENECK))

    # initial lstm cell state of shape (None, LSTM_UNITS),
    # we need to condition it on `img_embeds` placeholder.
    c0 = h0 = img_embed_bottleneck_to_h0(img_embed_to_bottleneck(img_embeds))

    # embed all tokens but the last for lstm input,
    # remember that L.Embedding is callable,
    # use `sentences` placeholder as input.
    word_embeds = word_embed(sentences[:, :-1])

    # during training we use ground truth tokens `word_embeds` as context for next token prediction.
    # that means that we know all the inputs for our lstm and can get
    # all the hidden states with one tensorflow operation (tf.nn.dynamic_rnn).
    # `hidden_states` has a shape of [batch_size, time steps, LSTM_UNITS].
    hidden_states, _ = tf.nn.dynamic_rnn(lstm, word_embeds,
                                         initial_state=tf.nn.rnn_cell.LSTMStateTuple(c0, h0))

    # now we need to calculate token logits for all the hidden states

    # first, we reshape `hidden_states` to [-1, LSTM_UNITS]
    flat_hidden_states = tf.reshape(hidden_states, [-1, LSTM_UNITS])

    # then, we calculate logits for next tokens using `token_logits_bottleneck` and `token_logits` layers
    flat_token_logits = token_logits(token_logits_bottleneck(flat_hidden_states))

    # then, we flatten the ground truth token ids.
    # remember, that we predict next tokens for each time step,
    # use `sentences` placeholder.
    flat_ground_truth = tf.reshape(sentences[:, 1:], [-1])

    # we need to know where we have real tokens (not padding) in `flat_ground_truth`,
    # we don't want to propagate the loss for padded output tokens.
    # NOTE: this is a boolean mask (True for real tokens, False for pad_idx),
    # not the 1.0/0.0 float mask the task text asks for; it is consumed by
    # tf.boolean_mask below.
    flat_loss_mask = tf.not_equal(flat_ground_truth, pad_idx)

    # compute cross-entropy between `flat_ground_truth` and `flat_token_logits` predicted by lstm
    xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=flat_ground_truth,
        logits=flat_token_logits
    )

    # compute average `xent` over tokens selected by `flat_loss_mask`.
    # we don't want to account misclassification of PAD tokens, because that doesn't make sense,
    # we have PAD tokens for batching purposes only!
    loss = tf.reduce_mean(tf.boolean_mask(xent, flat_loss_mask))


# In [150]:
# define optimizer operation to minimize the loss
optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
train_step = optimizer.minimize(decoder.loss)

# will be used to save/load network weights.
# you need to reset your default graph and define it in the same way to be able to load the saved weights!
saver = tf.train.Saver()

# initialize all variables
s.run(tf.global_variables_initializer())
# Output:
# /usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gradients_impl.py:93: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
#   "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "

# In [151]:
## GRADED PART, DO NOT CHANGE!
# Decoder shapes test
grader.set_answer("rbpnH", grading_utils.test_decoder_shapes(decoder, IMG_EMBED_SIZE, vocab, s))
# Decoder random loss test
grader.set_answer("E2OIL", grading_utils.test_random_decoder_loss(decoder, IMG_EMBED_SIZE, vocab, s))

# In [153]:
# you can make submission with answers so far to check yourself at this stage
grader.submit(COURSERA_EMAIL, COURSERA_TOKEN)
# Output: Submitted to Coursera platform. See results on assignment page!

# Training loop
# Evaluate train and validation metrics through training and log them.
# Ensure that loss decreases.

# In [154]:
# the nested caption lists are ragged, so the arrays must be object arrays;
# recent NumPy refuses to build them unless dtype=object is explicit
train_captions_indexed = np.array(train_captions_indexed, dtype=object)
val_captions_indexed = np.array(val_captions_indexed, dtype=object)


# In [183]:
# generate batch via random sampling of images and captions for them,
# we use `max_len` parameter to control the length of the captions (truncating long captions)
def generate_batch(images_embeddings, indexed_captions, batch_size, max_len=None):
    """
    `images_embeddings` is a np.array of shape [number of images, IMG_EMBED_SIZE].
    `indexed_captions` holds 5 vocabulary indexed captions for each image:
    [
        [
            [vocab[START], vocab["image1"], vocab["caption1"], vocab[END]],
            [vocab[START], vocab["image1"], vocab["caption2"], vocab[END]],
            ...
        ],
        ...
    ]
    Generate a random batch of size `batch_size`.
    Take random images and choose one random caption for each image.
    Remember to use `batch_captions_to_matrix` for padding and respect `max_len` parameter.
    Return feed dict {decoder.img_embeds: ..., decoder.sentences: ...}.
    """
    indexs = [random.randint(0, len(images_embeddings) - 1) for _ in range(batch_size)]
    batch_image_embeddings = images_embeddings[indexs]
    # sample over however many captions the image actually has: the count is
    # not always 5 (Out[186] below shows a ragged (82783,) array)
    batch_captions = [captions[np.random.randint(len(captions))]
                      for captions in indexed_captions[indexs]]
    batch_captions_matrix = batch_captions_to_matrix(batch_captions, pad_idx, max_len)
    return {decoder.img_embeds: batch_image_embeddings,
            decoder.sentences: batch_captions_matrix}


# In [184]:
batch_size = 64
n_epochs = 12
n_batches_per_epoch = 1000
n_validation_batches = 100  # how many batches are used for validation after each epoch

# In [185]:
# you can load trained weights here
# uncomment the next line if you need to load weights
# saver.restore(s, get_checkpoint_path(epoch=4))

# Look at the training and validation loss, they should be decreasing!

# In [186]:
train_img_embeds.shape, train_captions_indexed.shape
# Out[186]: ((82783, 2048), (82783,))

# In [187]:
# actual training loop
MAX_LEN = 20  # truncate long captions to speed up training

# to make training reproducible
np.random.seed(42)
random.seed(42)

for epoch in range(n_epochs):

    train_loss = 0
    pbar = tqdm_utils.tqdm_notebook_failsafe(range(n_batches_per_epoch))
    counter = 0
    for _ in pbar:
        train_loss += s.run([decoder.loss, train_step],
                            generate_batch(train_img_embeds,
                                           train_captions_indexed,
                                           batch_size,
                                           MAX_LEN))[0]
        counter += 1
        pbar.set_description("Training loss: %f" % (train_loss / counter))

    train_loss /= n_batches_per_epoch

    val_loss = 0
    for _ in range(n_validation_batches):
        val_loss += s.run(decoder.loss, generate_batch(val_img_embeds,
                                                       val_captions_indexed,
                                                       batch_size,
                                                       MAX_LEN))
    val_loss /= n_validation_batches

    print('Epoch: {}, train loss: {}, val loss: {}'.format(epoch, train_loss, val_loss))

    # save weights after finishing epoch
    saver.save(s, get_checkpoint_path(epoch))

print("Finished!")
# Output:
# Epoch: 0, train loss: 3.0007614777088167, val loss: 2.9724034023284913
# Epoch: 1, train loss: 2.8531791372299193, val loss: 2.9006982970237734
# Epoch: 2, train loss: 2.7954050121307374, val loss: 2.8111998438835144
# Epoch: 3, train loss: 2.730731366157532, val loss: 2.750483591556549
# Epoch: 4, train loss: 2.6690069699287413, val loss: 2.749560286998749
# Epoch: 5, train loss: 2.633123325586319, val loss: 2.7148624300956725
# Epoch: 6, train loss: 2.5939396080970765, val loss: 2.6811715364456177
# Epoch: 7, train loss: 2.574599018335342, val loss: 2.6403690791130066
# Epoch: 8, train loss: 2.546513616323471, val loss: 2.627152864933014
# Epoch: 9, train loss: 2.5285718023777006, val loss: 2.6443107414245604
# Epoch: 10, train loss: 2.4949201991558074, val loss: 2.6084690499305725
# Epoch: 11, train loss: 2.478545124053955, val loss: 2.594680278301239
# Finished!

# In [188]:
## GRADED PART, DO NOT CHANGE!
# Validation loss
grader.set_answer("YJR7z", grading_utils.test_validation_loss(
    decoder, s, generate_batch, val_img_embeds, val_captions_indexed))

# In [190]:
# you can make submission with answers so far to check yourself at this stage
grader.submit(COURSERA_EMAIL, COURSERA_TOKEN)
# Output: Submitted to Coursera platform. See results on assignment page!
# In [191]:
# check that it's learnt something, outputs accuracy of next word prediction (should be around 0.5)
from sklearn.metrics import accuracy_score, log_loss


def decode_sentence(sentence_indices):
    """Map vocabulary indices back to tokens and join them with spaces."""
    return " ".join(list(map(vocab_inverse.get, sentence_indices)))


def check_after_training(n_examples):
    """Print loss, next-token accuracy and `n_examples` decoded predictions
    for one random training batch."""
    fd = generate_batch(train_img_embeds, train_captions_indexed, batch_size)
    logits = decoder.flat_token_logits.eval(fd)
    truth = decoder.flat_ground_truth.eval(fd)
    mask = decoder.flat_loss_mask.eval(fd).astype(bool)
    predicted = logits.argmax(axis=1)
    print("Loss:", decoder.loss.eval(fd))
    # accuracy is measured on real (non-padding) positions only
    print("Accuracy:", accuracy_score(predicted[mask], truth[mask]))
    for example_idx in range(n_examples):
        print("Example", example_idx)
        print("Predicted:", decode_sentence(predicted.reshape((batch_size, -1))[example_idx]))
        print("Truth:", decode_sentence(truth.reshape((batch_size, -1))[example_idx]))
        print("")


check_after_training(3)
# Output:
# Loss: 2.37412
# Accuracy: 0.501388888889
# Example 0
# Predicted: a person flying flying a kite in a building of people #END# #END# #END# #END# #END# #END# #END# #END# #END# #END#
# Truth: a child is flying a kite near a group of buildings #END# #PAD# #PAD# #PAD# #PAD# #PAD# #PAD# #PAD# #PAD# #PAD#
# Example 1
# Predicted: a person of a doing a skateboard in down ramp of a ramp #END# #END# #END# #END# #END# #END# #END# #END#
# Truth: a closeup of someone on a skateboard riding the edge of a ramp #END# #PAD# #PAD# #PAD# #PAD# #PAD# #PAD# #PAD#
# Example 2
# Predicted: a bed with a bed and a on furniture #END# a wall #END# #END# #END# #END# #END# #END# #END# #END# #END#
# Truth: a bedroom with aqua walls and cutouts of rain on the wall #END# #PAD# #PAD# #PAD# #PAD# #PAD# #PAD# #PAD# #PAD#

# In [192]:
# save last graph weights to file!
saver.save(s, get_checkpoint_path())
# Out[192]: '/root/intro-to-dl/week6/weights'

# Applying model
# Here we construct a graph for our final model.
# It will work as follows:
#   - take an image as an input and embed it
#   - condition lstm on that embedding
#   - predict the next token given a START input token
#   - use predicted token as an input at next time step
#   - iterate until you predict an END token


# In [193]:
class final_model:
    # CNN encoder
    encoder, preprocess_for_model = get_cnn_encoder()
    saver.restore(s, get_checkpoint_path())  # keras applications corrupt our graph, so we restore trained weights

    # containers for current lstm state
    lstm_c = tf.Variable(tf.zeros([1, LSTM_UNITS]), name="cell")
    lstm_h = tf.Variable(tf.zeros([1, LSTM_UNITS]), name="hidden")

    # input images
    input_images = tf.placeholder('float32', [1, IMG_SIZE, IMG_SIZE, 3], name='images')

    # get image embeddings
    img_embeds = encoder(input_images)

    # initialize lstm state conditioned on image
    init_c = init_h = decoder.img_embed_bottleneck_to_h0(decoder.img_embed_to_bottleneck(img_embeds))
    init_lstm = tf.assign(lstm_c, init_c), tf.assign(lstm_h, init_h)

    # current word index
    current_word = tf.placeholder('int32', [1], name='current_input')

    # embedding for current word
    word_embed = decoder.word_embed(current_word)

    # apply lstm cell, get new lstm states
    new_c, new_h = decoder.lstm(word_embed, tf.nn.rnn_cell.LSTMStateTuple(lstm_c, lstm_h))[1]

    # compute logits for next token
    new_logits = decoder.token_logits(decoder.token_logits_bottleneck(new_h))
    # compute probabilities for next token
    new_probs = tf.nn.softmax(new_logits)

    # `one_step` outputs probabilities of next token and updates lstm hidden state
    one_step = new_probs, tf.assign(lstm_c, new_c), tf.assign(lstm_h, new_h)

# Output: INFO:tensorflow:Restoring parameters from /root/intro-to-dl/week6/weights

# In [194]:
# look at how temperature works for probability distributions
# for high temperature we have more uniform distribution
_ = np.array([0.5, 0.4, 0.1])
for t in [0.01, 0.1, 1, 10, 100]:
    print(" ".join(map(str, _**(1/t) / np.sum(_**(1/t)))), "with temperature", t)
# Output:
# 0.999999999796 2.03703597592e-10 1.26765059997e-70 with temperature 0.01
# 0.903037043325 0.0969628642039 9.24709932365e-08 with temperature 0.1
# 0.5 0.4 0.1 with temperature 1
# 0.353447726392 0.345648113606 0.300904160002 with temperature 10
# 0.335367280481 0.334619764349 0.33001295517 with temperature 100


# In [195]:
# this is an actual prediction loop
def generate_caption(image, t=1, sample=False, max_len=20):
    """
    Generate caption for given image.
    if `sample` is True, we will sample next token from predicted probability distribution.
    `t` is a temperature during that sampling,
        higher `t` causes more uniform-like distribution = more chaos.

    Returns the list of tokens starting with START; END is the last token
    only if it was predicted within `max_len` steps.
    """
    # condition lstm on the image
    s.run(final_model.init_lstm,
          {final_model.input_images: [image]})

    # current caption
    # start with only START token
    caption = [vocab[START]]

    for _ in range(max_len):
        next_word_probs = s.run(final_model.one_step,
                                {final_model.current_word: [caption[-1]]})[0]
        next_word_probs = next_word_probs.ravel()

        # apply temperature
        next_word_probs = next_word_probs**(1/t) / np.sum(next_word_probs**(1/t))

        if sample:
            next_word = np.random.choice(range(len(vocab)), p=next_word_probs)
        else:
            next_word = np.argmax(next_word_probs)

        caption.append(next_word)
        if next_word == vocab[END]:
            break

    return list(map(vocab_inverse.get, caption))


# In [196]:
# look at validation prediction example
def apply_model_to_image_raw_bytes(raw):
    """Decode the image bytes `raw`, show the image and print its caption."""
    img = utils.decode_image_from_buf(raw)
    fig = plt.figure(figsize=(7, 7))
    # False, not the string 'off': a non-empty string is truthy
    plt.grid(False)
    plt.axis('off')
    plt.imshow(img)
    img = utils.crop_and_preprocess(img, (IMG_SIZE, IMG_SIZE), final_model.preprocess_for_model)
    # drop START, and drop the last token only when it really is END;
    # a caption cut off at max_len has no END and would lose its last word
    words = generate_caption(img)[1:]
    if words and words[-1] == END:
        words = words[:-1]
    print(' '.join(words))
    plt.show()


def show_valid_example(val_img_fns, example_idx=0):
    """Caption the `example_idx`-th image of val2014_sample.zip that is listed in `val_img_fns`."""
    all_files = set(val_img_fns)
    with zipfile.ZipFile("val2014_sample.zip") as zf:
        found_files = [info for info in zf.filelist
                       if info.filename.rsplit("/")[-1] in all_files]
        example = found_files[example_idx]
        raw = zf.read(example)
    apply_model_to_image_raw_bytes(raw)


show_valid_example(val_img_fns, example_idx=100)
# Output: a baseball player is swinging his bat at a ball

# In
# [197]:
# sample more images from validation
with zipfile.ZipFile("val2014_sample.zip") as _sample_zip:
    _n_sample_files = len(_sample_zip.filelist)
for idx in np.random.choice(range(_n_sample_files - 1), 10):
    show_valid_example(val_img_fns, example_idx=idx)
    time.sleep(1)
# Output (one caption per image, concatenated in the original page):
# a bear is sitting on a rock in the water a train is parked on the tracks near a fence a group of people standing around a man in a room a young boy in a red shirt and a white shirt and a white shirt and a white shirt a city with many boats and a building a baseball player is swinging at a ball a baby elephant standing in a field with a tree in the background a group of cars driving down a street a bus is driving down the street with a bus a woman sitting at a table with a laptop

# You can download any image from the Internet and apply your model to it!


def _caption_image_file(path):
    """Caption the image stored at `path`, closing the file afterwards."""
    with open(path, "rb") as image_file:
        apply_model_to_image_raw_bytes(image_file.read())


# In [205]:
download_utils.download_file(
    "https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1562794330534&di=1e83ed1d91d6b45c6cc1faa4f144dce1&imgtype=0&src=http%3A%2F%2Fs6.sinaimg.cn%2Fmiddle%2F4c271807gabeaf405af25%26690",
    "dora1.jpg"
)
_caption_image_file("dora1.jpg")
# Output: a man holding a cell phone in front of a store

# In [206]:
download_utils.download_file(
    "https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1562794426849&di=fd0de4f3f602675fdbb2ae6b557a789a&imgtype=jpg&src=http%3A%2F%2Fwww.gaoxiaoa.cn%2Fuploads%2F2018%2F07%2F20%2Fy0rwjpcuuor2210.jpg",
    "dora2.jpg"
)
_caption_image_file("dora2.jpg")
# Output: a man holding a pair of scissors in a store

# In [207]:
# NOTE: reuses the file name "dora2.jpg", overwriting the previous download
download_utils.download_file(
    "https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1562794563144&di=2bc884423f1df82010cd5339f31950ed&imgtype=0&src=http%3A%2F%2Fi.gtimg.cn%2Fqqlive%2Fimg%2Fjpgcache%2Ffiles%2Fqqvideo%2Fhori%2F4%2F4h2fv8pu7lmkmp2.jpg",
    "dora2.jpg"
)
_caption_image_file("dora2.jpg")
# Output: a person holding a kite in a parking lot

# In [212]:
download_utils.download_file(
    "https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1562794944314&di=f8e405f3cef51839e36119df6bee30da&imgtype=0&src=http%3A%2F%2Fphotocdn.sohu.com%2F20160118%2Fmp55134365_1453120177203_10.jpeg",
    "li.jpg"
)
_caption_image_file("li.jpg")
# Output: a man in a white shirt and a white shirt and a white shirt and a white shirt

# In [214]:
download_utils.download_file(
    "https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1562795102654&di=6e991c1996a6cb8c3875c737270400ba&imgtype=0&src=http%3A%2F%2Fhnrb.hinews.cn%2Fresfile%2F2016-03-01%2F015%2F1860655_hnrbtp1_1456751616813_b.jpg",
    "li2.jpg"
)
_caption_image_file("li2.jpg")
# Output: a man in a suit and tie standing in front of a microphone

# In [216]:
download_utils.download_file(
    "https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1562797334376&di=74df4bee2022d1b0519a759e9e6ec12d&imgtype=0&src=http%3A%2F%2Fn.sinaimg.cn%2Fent%2Ftransform%2F20170703%2FYGH3-fyhskrq1913341.jpg",
    "test.jpg"
)
_caption_image_file("test.jpg")
# Output: a man in a white shirt and tie standing next to a man

# In [217]:
download_utils.download_file(
    "https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1562797392119&di=81c23cda7dbc08fa561a561dd806473e&imgtype=0&src=http%3A%2F%2Fwww.hinews.cn%2Fpic%2F0%2F16%2F62%2F10%2F16621075_026693.jpg",
    "test2.jpg"
)
_caption_image_file("test2.jpg")
# Output: a giraffe is eating from a white plate

# In [218]:
download_utils.download_file(
    "https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1562797392118&di=1848b6cb90b57b0fde208794abf22aa2&imgtype=0&src=http%3A%2F%2Fi0.sinaimg.cn%2Fdy%2Fcr%2F2014%2F0716%2F4150211456.jpg",
    "test3.jpg"
)
_caption_image_file("test3.jpg")
# Output: a man is standing next to a statue of a statue

# In [219]:
download_utils.download_file(
    "https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1562797392116&di=e68eb448b9247779f4624e10e9bf8b95&imgtype=0&src=http%3A%2F%2Fi0.hexunimg.cn%2F2016-08-22%2F185638157.jpg",
    "test4.jpg"
)
_caption_image_file("test4.jpg")
# Output: a woman in a black jacket and a woman standing next to a woman

# In [220]:
download_utils.download_file(
    "https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1562797392116&di=580a384f03823c0503be39144bfdd85b&imgtype=0&src=http%3A%2F%2Fn.sinaimg.cn%2Fent%2Ftransform%2F20161207%2FzAUH-fxyipxf7913222.jpg",
    "test5.jpg"
)
_caption_image_file("test5.jpg")
# Output: a man and woman standing in a field with a kite

# In [221]:
download_utils.download_file(
    "https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1562797392116&di=665cab330fb52ec6883f1398ec2c167c&imgtype=0&src=http%3A%2F%2Fgb.cri.cn%2Fmmsource%2Fimages%2F2015%2F05%2F25%2Fex20150525008.jpg",
    "test6.jpg"
)
_caption_image_file("test6.jpg")
# Output: a man in a suit and tie standing in front of a building

# Now it's time to find 10 examples where your model works well and 10
# examples where it fails!
# You can use images from validation set as follows:
#     show_valid_example(val_img_fns, example_idx=...)
# You can use images from the Internet as follows:
#     ! wget ...
#     apply_model_to_image_raw_bytes(open("...", "rb").read())
# If you use these functions, the output will be embedded into your notebook
# and will be visible during peer review!
# When you're done, download your notebook using 'File' -> 'Download as' ->
# 'Notebook' and prepare that file for peer review!

# In [ ]:
### YOUR EXAMPLES HERE ###

# That's it!
# Congratulations, you've trained your image captioning model and now can
# produce captions for any picture from the Internet!

# Donate article here: Donate / WeChat Pay / Alipay